home *** CD-ROM | disk | FTP | other *** search
- #!/usr/bin/perl -w
- # Script Version 1.2
- ###############################################################
- # Welcome to the Link Checker script. #
- # The rules are simple: I would love to have my coding fixed. #
- # Send emails to <braxton@braxtech.com> #
- # please don't pick nits though. #
- ###############################################################
- # Licensed under the BSPL: #
- # 1) If it's immoral, discrediting, or just downright #
- # insulting don't do it. #
- # 2) If in doubt, email me. #
- # Take care, and happy scripting, Braxton Sherouse. #
- ###############################################################
-
- use strict;
-
- # Functions
- sub doCollectLinks($);
- sub doCheckLinks(@);
- sub URLExists($);
- sub fileExists($);
- sub combinePaths($$$);
- sub printResult($$$$);
- sub mortallyWounded($$$);
- sub debugMessage($$$);
-
- sub isURL($);
- sub isRecursableFile($);
- sub isRecursableURL($);
-
- ## Default Values for Arguments
- my $verbose=0;
- my $postponeURLs=1;
- my $easyparse=0;
- my $connecttimeout=".5";
- my $totaltimeout="5";
- my $userAgent="BLT(LinkChecker0.2)";
- my $recurse=0;
- my $skipComments=0;
- my $showEmailLinks=0;
- my $showLineNumbers=0;
- my @defaultFiles; #given default values later.
-
- ## Global Constants
- my @args=@ARGV;
- my $DEBUG=0;
- my $userhome=$ENV{HOME};
- my %linkStatus=(failure =>0,
- success =>1,
- email =>2,
- protocolError =>3,
- forbidden =>4,
- timeout =>5,
- anchor =>6);
-
- ## Global Declarations
- my $rootdir;
- my $startloc;
- my %URLResultCache;
- my %forwardingURLs;
- my @processedlinks;
- my @postponedURLs;
-
-
- parseArgs();
- if ($DEBUG)
- {
- debugMessage("userhome","$userhome",__LINE__);
- debugMessage("rootdir","$rootdir",__LINE__);
- debugMessage("startloc","$startloc",__LINE__);
- debugMessage("postpone","$postponeURLs",__LINE__);
- debugMessage("easyparse","$easyparse",__LINE__);
- debugMessage("connecttimeout","$connecttimeout",__LINE__);
- debugMessage("totaltimeout","$totaltimeout",__LINE__);
- debugMessage("recurse","$recurse",__LINE__);
- debugMessage("useragent","$userAgent",__LINE__);
- }
-
- if ((isURL($startloc) && (URLExists($startloc)==$linkStatus{'success'})) || (!isURL($startloc) && fileExists($startloc)))
- {
- print "::: v :: rootdir :: $rootdir :: __LINE__ :::\n" if ($easyparse && $showLineNumbers);
- doCheckLinks(doCollectLinks($startloc));
- checkPostponedURLs();
- }
- else
- {mortallyWounded("starting location","doesn't exist",__LINE__);}
-
- sub doCollectLinks($){
- my $page=shift;
- my $work;
- my @temparray;
- my $baseurl="";
- my $in;
-
- debugMessage("doCollectLinks(\$)","$page",__LINE__) if $DEBUG;
-
- while ($forwardingURLs{$page})
- {$page=$forwardingURLs{$page};}
-
- if (isURL($page))
- {$in="curl --connect-timeout $connecttimeout -A \"$userAgent\" --max-time $totaltimeout \"$page\" -s |";}
- else
- {$in=$page;}
-
- open IN, $in or mortallyWounded("doCollectLinks","file disappeared: $page",__LINE__);
- open OUT, ">$userhome/.tempout.txt" or mortallyWounded("doCollectLinks","couldn't create ~/.tempout.txt",__LINE__);
-
- while (<IN>) {$work.=$_;}
- $work=~s|<!--(.*?)-->||gs if $skipComments;
- print OUT $work;
- close OUT;
- close IN;
-
- open IN, "$userhome/.tempout.txt" or mortallyWounded("doCollectLinks","couldn't open ~/.tempout.txt",__LINE__);
- while (<IN>)
- {
- while (/<([^>]*)\s+(href|src|background)\s*=\s*(\"|\')(\S*)(\"|\')[^>]*>/gmi)
- {
- my $tagname=$1;
- my $reftype=$2;
- my $content=$4;
- if ($content!~/^javascript:/)
- {
- if ($baseurl)
- {push @temparray, {url=>combinePaths($content,$baseurl,0),line=>$.,hasbase=>$baseurl};}
- else
- {push @temparray, {url=>$content,line=>$.};}
- }
- if ($tagname=~/^base/i && $reftype=~/^href$/i)
- {$baseurl=$content;}
- }
- }
- close IN;
-
- unlink "$userhome/.tempout.txt";
-
- push @temparray, "$page"; #it's the first thing to get popped later.
- return @temparray;
- }
-
- sub doCheckLinks(@){
- my @linkarray=@_;
- my $from=pop @linkarray;
-
- debugMessage("doCheckLinks(\@)","$from",__LINE__) if $DEBUG;
-
- if (!grep /^$from$/i, @processedlinks)
- {
- push @processedlinks, $from;
- if (isURL($from) && $from=~m#/$#)
- {
- foreach my $deffile (@defaultFiles)
- {
- if (!grep /^$from$deffile$/i,@processedlinks)
- {push @processedlinks,"$from$deffile";}
- }
- }
- for (my $a=0;$a<scalar @linkarray;$a++)
- {
- my $cururl=$linkarray[$a]{'url'};
- my $hasbase=$linkarray[$a]{'hasbase'};
- my $lineNumber=$linkarray[$a]{'line'};
- my $combinedPath=combinePaths($cururl,$from,$hasbase);
-
- if (isURL($cururl) && $cururl!~/^$rootdir/i)
- {
- if ($postponeURLs)
- {push @postponedURLs, {url=>$cururl,line=>$lineNumber,from=>$from};}
- else
- {printResult(URLExists($combinedPath),$combinedPath,$from,$lineNumber);}
- }
- elsif ($cururl=~/mailto:(.*)/i)
- {printResult($linkStatus{"email"},$1,$from,$lineNumber);}
- elsif (!isURL($cururl) && $cururl=~m#(\w{2,6}://.*)#)
- {printResult($linkStatus{"protocolError"},$1,$from,$lineNumber);}
- elsif (!isURL($startloc) and !isURL($cururl))
- {
- my $success=fileExists($combinedPath);
- printResult($success,$combinedPath,$from,$lineNumber);
- if ($success && $recurse && $combinedPath=~/^$rootdir/ && isRecursableFile($combinedPath))
- {doCheckLinks(doCollectLinks(replaceDirWithDefaultFile($combinedPath)));}
- }
- elsif(isURL($combinedPath))
- {
- my $success=URLExists($combinedPath);
- printResult($success,$combinedPath,$from,$lineNumber);
- if ($success && $recurse && $combinedPath=~/^$rootdir/ && isRecursableURL($combinedPath))
- {doCheckLinks(doCollectLinks("$combinedPath"));}
- }
- }
- }
- else {}
- }
-
-
- sub URLExists($){
- my $url=shift;
- my $returnValue=$linkStatus{"failure"}; # default to failure. just to be safe.
-
- if ($url=~/#/)
- {return $linkStatus{"anchor"};}
-
- debugMessage("URLExists(\$)","$url",__LINE__) if $DEBUG;
-
- if ($URLResultCache{$url})
- {return $URLResultCache{$url};}
- else
- {
- $url=~m#(https?|ftp)://(.*)#i;
- my $method=lc($1);
-
- #### eventually this should use --referer <URL> to send the referrer to the server.
- open CURL, "curl -I --connect-timeout $connecttimeout -A \"$userAgent\" --max-time $totaltimeout \"$url\" -s -S --stderr - |" or mortallyWounded("URLExists","couldn't fork curl",__LINE__);
-
- while (<CURL>)
- {
- if (/^curl: \((\d+)\)/)
- {
- my $errorCode=$1;
- debugMessage("curlSaid",$errorCode,__LINE__) if $DEBUG;
-
- if ($errorCode==1) {$returnValue=$linkStatus{'protocolError'};}
- elsif ($errorCode>1 && $errorCode<9) {$returnValue=$linkStatus{'failure'};}
- elsif ($errorCode>8 && $errorCode<13) {$returnValue=$linkStatus{'forbidden'};}
- elsif ($errorCode==28) {$returnValue=$linkStatus{'timeout'};}
- last;
- }
- elsif ($method eq "http" and m|HTTP/\d\.\d (\d{3})\s?.*|i)
- {
- my $errorCode=$1;
- debugMessage("webServerSaid",$errorCode,__LINE__) if $DEBUG;
-
- if ($errorCode eq "405")
- {
- # server gives "method not allowed" error.
- # Use GET to download the entire document... argh.
- system ("curl --connect-timeout $connecttimeout -L -A \"$userAgent\" --max-time $totaltimeout $url -s > \"$userhome/.tempoutcurl.txt\"");
- do {$returnValue=$linkStatus{'success'}; unlink "$userhome/.tempoutcurl.txt";} if (-s "$userhome/.tempoutcurl.txt");
- }
- elsif($errorCode=~/^2/)
- {$returnValue=$linkStatus{"success"};}
- elsif($errorCode=~/^3/)
- {
- do {$_=<CURL>;} until ($_=~/Location: (.*)/);
- $forwardingURLs{$url}=$1;
- $returnValue=URLExists($1);
- }
- elsif ($errorCode=~/^40(1|3)/)
- {
- $returnValue=$linkStatus{"forbidden"};
- }
- last;
- }
- elsif ($1 eq "ftp" and /Content-Length: (\d{0,20})/)
- {
- $returnValue=$linkStatus{"success"};
- last;
- }
- }
- close CURL;
-
-
-
- $URLResultCache{$url}=$returnValue;
-
- if ($url=~m#/$#)
- {
- foreach my $deffile (@defaultFiles)
- {
- if (!$URLResultCache{"$url$deffile"})
- {$URLResultCache{"$url$deffile"}=$returnValue;}
- }
- }
- else
- {
- # check to see if it is a default file.
- # if it is, add the directory.
- $url=~m#(.+/)(.*)$#;
- if (grep /^$2$/i,@defaultFiles)
- {$URLResultCache{"$1"}=$returnValue;}
- }
- return $returnValue;
- }
- }
-
-
- sub fileExists($){
- my $file=shift;
-
- debugMessage("fileExists(\$)","$file",__LINE__) if $DEBUG;
-
- if ($file=~/#/)
- {return $linkStatus{"anchor"};}
-
- $file=replaceDirWithDefaultFile($file);
-
- if (-e "$file")
- {return $linkStatus{"success"};}
- else
- {return $linkStatus{"failure"};}
- }
-
-
-
- sub checkPostponedURLs{
-
- debugMessage("checkPostponedURLs()","()",__LINE__) if $DEBUG;
-
- if ($verbose)
- {
- if (!$easyparse)
- {print " notice : switching to external links\n" if (scalar @postponedURLs>=1);}
- else
- {print "::: n :: switch :: external :: x :::\n" if (scalar @postponedURLs>=1);}
- }
-
- for(my $a=0;$a<scalar @postponedURLs;$a++)
- {
- printResult(URLExists($postponedURLs[$a]{'url'}),$postponedURLs[$a]{'url'},$postponedURLs[$a]{'from'},$postponedURLs[$a]{'line'});
- }
-
- }
-
- sub parseArgs{
- while (@args)
- {
- my $foo=pop @args;
- if ($foo eq "-dp")
- {$postponeURLs=0;}
- elsif ($foo eq "-v")
- {$verbose=1;}
- elsif ($foo eq "-ep")
- {$easyparse=1;}
- elsif ($foo eq "-r")
- {$recurse=1;}
- elsif ($foo=~/--totaltimeout=(.*)/)
- {$totaltimeout=$1;}
- elsif ($foo=~/--connecttimeout=(.*)/)
- {$connecttimeout=$1;}
- elsif ($foo=~/--userAgent=(.*)/)
- {$userAgent=$1;}
- elsif ($foo=~/--defaultFiles=(.*)/)
- {@defaultFiles=split /,/, $1, -64;}
- elsif ($foo eq "--skipComments")
- {$skipComments=1;}
- elsif ($foo eq "--lineNumbers")
- {$showLineNumbers=1;}
- elsif ($foo eq "--emailLinks")
- {$showEmailLinks=1;}
- elsif ($foo=~/^-./)
- {
- print <<" END";
- Invalid Argument. Options are:
- [-dp]: don't postpone external links
- [-v]: verbose mode
- [-ep]: easy parse mode
- [-r]: recursive checking
- [--skipComments]: skip links in HTML comments
- [--emailLinks]: show email links
- [--totaltimeout=*]: total timeout
- [--connecttimeout=*]: connect timeout
- [--userAgent=*]: user agent
- [--defaultFiles=*]: default files [comma separated]
- END
- exit 0;
- }
- else
- {$startloc=$foo;}
- }
-
- if (!@defaultFiles)
- {push @defaultFiles, ("index.html","index.htm","default.html","default.htm","index.shtml","default.shtml");}
- if (!$startloc)
- {mortallyWounded("starting location","unspecified",__LINE__);}
-
- if (isURL($startloc))
- {
- if ($startloc=~m|(.*)/$|)
- {
- $rootdir=$1;
- }
- elsif (isRecursableURL($startloc))
- {
- $startloc=~m|^(.*)/.*$|;
- $rootdir=$1;
- }
- elsif (isRecursableURL("$startloc/"))
- {
- $rootdir=$startloc;
- $startloc="$startloc/";
- }
- else
- {mortallyWounded("starting location","not recursable",__LINE__);}
- }
- else
- {
- if ($startloc=~m|(.*)/$|)
- {
- $rootdir=$1;
- $startloc=replaceDirWithDefaultFile($startloc);
- if ($startloc=~m|/$|)
- {
- # if replaceDir can't find a matching default file,
- # we set the starting location to somethine we KNOW will fail
- $startloc.="filethatdoesn'texist.html";
- }
- }
- elsif (isRecursableFile($startloc))
- {
- $startloc=~m|^(.*)/.*$|;
- $rootdir=$1;
- }
- elsif (isRecursableFile(replaceDirWithDefaultFile("$startloc/")))
- {
- $rootdir=$startloc;
- $startloc=replaceDirWithDefaultFile("$startloc/");
- }
- else
- {mortallyWounded("starting location","not recursable",__LINE__);}
- }
-
-
- return;
- }
-
-
- sub isURL($){
- my $tempURL=shift;
- return 1 if ($tempURL=~m#^(https?|ftp)://#);
- return 0;
- }
-
- sub combinePaths($$$){
- my $to=shift;
- my $from=shift;
- my $hasbase=shift;
- my $result;
-
- $to=~s|^\./||;
- if ($hasbase)
- {$to=~s|^$hasbase|/|;}
- else
- {$to=~s|^$rootdir||;}
-
- if (isURL($to) or $to=~/^mailto:/)
- {$result=$to;}
- elsif ($to=~/\.\.\//)
- {
- my $numofmatches;
- $numofmatches=($to=~s|\.\./||g);
- $numofmatches=0 if $numofmatches eq "";
- if ($from=~m|(.+/)(.+/){$numofmatches}.*$|)
- {$result="$1$to";print "$1 + $to\n" if $DEBUG==2;}
- }
- elsif ($to=~m|^/|)
- {
- if ($hasbase)
- {
- $to=~s|^/||;
- $result="$hasbase$to";
- }
- else
- {$result="$rootdir$to";}
- }
- else
- {
- $from=~m|(.+/).*$|;
- $result="$1$to";
- }
-
- debugMessage("combinePathsResult","$result",__LINE__) if $DEBUG;
-
- return $result;
- }
-
-
-
- sub printResult($$$$) {
- my $type=shift;
- my $to=shift;
- my $from=shift;
- my $line=shift;
- my $string;
-
- $from=~s/$rootdir//;
- $to=~s/$rootdir//;
-
- my $isurl=isURL($to);
-
- $string.="::: " if ($easyparse);
-
-
- if ($type==$linkStatus{"success"} && $verbose)
- {
- if ($isurl)
- {
- if ($easyparse) {$string.="ext+";}
- else {$string.=" external link exists";}
- }
- else
- {
- if ($easyparse) {$string.="int+";}
- else {$string.=" internal link exists";}
- }
- }
- elsif ($type==$linkStatus{"failure"})
- {
- if ($isurl && $to!~/$rootdir/)
- {
- if ($easyparse) {$string.="ext-";}
- else {$string.=sprintf "%22s","external link failed";}
- }
- else
- {
- if ($easyparse) {$string.="int-";}
- else {$string.=sprintf "%22s","internal link failed";}
- }
- }
- elsif ($type==$linkStatus{"email"}&& $showEmailLinks)
- {
- if ($easyparse) {$string.="e";}
- else {$string.=sprintf "%22s","can't check email link";}
- }
- elsif ($type==$linkStatus{"protocolError"})
- {
- if ($easyparse) {$string.="up";}
- else {$string.=sprintf "%22s","unsupported protocol";}
- }
- elsif ($type==$linkStatus{"forbidden"})
- {
- if ($easyparse) {$string.="f";}
- else {$string.=sprintf "%22s","forbidden";}
- }
- elsif ($type==$linkStatus{"timeout"})
- {
- if ($easyparse) {$string.="to";}
- else {$string.=sprintf "%22s","timed out";}
- }
- else
- {return;}
-
- if ($easyparse) {$string.=" :: $to :: $from";}
- else {$string.=" : $to in $from";}
-
- if ($showLineNumbers)
- {
- if ($easyparse) {$string.=" :: $line :::\n";}
- else {$string.=" line $line\n";}
- }
- else
- {
- if ($easyparse) {$string.=" :::\n";}
- else {$string.="\n";}
- }
-
- print $string;
-
- }
-
- sub replaceDirWithDefaultFile($){
- my $file=shift;
- if ($file=~m#(.*)/$#)
- {
- my $a;
- for ($a=0;$a<scalar @defaultFiles;$a++)
- {
- if (-e "$file$defaultFiles[$a]")
- {
- $file="$file$defaultFiles[$a]";
- last;
- }
- }
- }
- return $file;
- }
-
- sub isRecursableURL($){
- my $URL=shift;
- return 1 if ($URL=~m#((\.s?html?)|/|(\.(asp|jsp|\w?cgi\d?|pl|php\d?|woa))(\?.+)?|(\?.*))$#i);
- return 0;
- }
-
- sub isRecursableFile($){
- my $file=shift;
- return 1 if ($file=~m#(\.s?html?)|/$#i);
- return 0;
- }
-
-
- sub mortallyWounded($$$){
- my $category=shift;
- my $subcategory=shift;
- my $line=shift;
- my $string;
-
- if ($easyparse)
- {$string.="::: fatal :: $category :: $subcategory";}
- else
- {$string.=sprintf("%22s : $category $subcategory","fatal");}
-
- if ($showLineNumbers)
- {
- if ($easyparse)
- {$string.=" :: $line :::\n";}
- else
- {$string.=" line $line\n";}
- }
- else
- {
- if ($easyparse)
- {$string.=" :::\n";}
- else
- {$string.="\n";}
- }
- print $string;
- exit 0;
- }
-
- sub debugMessage($$$)
- {
- my $category=shift;
- my $subcategory=shift;
- my $line=shift;
- my $string;
-
- if ($easyparse)
- {$string.="::: debug :: $category :: $subcategory";}
- else
- {$string.=sprintf("%22s : %22s ===> $subcategory","debug",$category);}
-
- if ($showLineNumbers)
- {
- if ($easyparse)
- {$string.=" :: $line :::\n";}
- else
- {$string.=" line $line\n";}
- }
- else
- {
- if ($easyparse)
- {$string.=" :::\n";}
- else
- {$string.="\n";}
- }
- print $string;
- }